import pandas as pd
import numpy as np
import matplotlib as mpl
import seaborn as sns
# Load the daily web-analytics dataset; the unnamed first CSV column becomes the index.
df = pd.read_csv("C:/Users/Lenovo/Desktop/tp/date_index.csv", low_memory=False, index_col="Unnamed: 0", na_values="NA")
df.head()
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
sns.set(style='whitegrid', font_scale=1.2)
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True, stat='density') is the documented replacement.
sns.histplot(df.revenue, kde=True, stat='density');
df.info()
df.describe()
# Percentage columns arrive as strings like "42.5%"; convert each one to a
# float fraction in [0, 1].
for pct_col in ('bounce', 'new_visitors_share', 'dns_percent'):
    df[pct_col] = df[pct_col].str.rstrip('%').astype('float') / 100.0
# Convert "H:MM:SS" session-duration strings to total seconds.
# Vectorized replacement for the original row loop, which assumed the
# DataFrame had a 1..n integer index and broke for any other index.
hms = df['session_duration'].str.split(':', expand=True).astype(int)
df['session_duration'] = hms[0] * 3600 + hms[1] * 60 + hms[2]
# Derive a 'season' label from each date's month (dates look like "YYYY-MM-DD").
# Vectorized replacement for the original row loop, which assumed a 1..n
# integer index and iterated in Python.
_month_to_season = {
    3: "spring", 4: "spring", 5: "spring",
    6: "summer", 7: "summer", 8: "summer",
    9: "fall", 10: "fall", 11: "fall",
    12: "winter", 1: "winter", 2: "winter",
}
df['season'] = pd.to_datetime(df['date']).dt.month.map(_month_to_season)
# Drop the raw date plus the per-channel columns (google, yandex, ... —
# presumably traffic/ad sources; confirm) that aren't used downstream.
df = df.drop(['date', "google", 'mailru', 'yandex', 'facebook', 'X1', 'targetvk', 'targetinst', 'tv', 'software', 'targetmail'], axis=1)
df['season'].value_counts()
# Column selection after groupby must be a list — passing a bare tuple was
# deprecated and is a TypeError in pandas >= 2.0.
df.groupby('season')[['visits_all', "session_duration", 'revenue', 'transactions']].mean()
sns.set(style='whitegrid', font_scale=1.5)
sns.catplot(x="season", y='revenue', data=df, height=9);
sns.set(style='whitegrid', font_scale=1.5)
g = sns.relplot(x="transactions", y="revenue", hue="season", data=df, height=10)
# those 2 dots look like outliers
# Drop both outliers in a single call. Dropping positions [826, 455] of the
# current index is equivalent to the original two sequential drops, because
# 455 < 826 so the first drop never shifted position 455.
df = df.drop(df.index[[826, 455]])
# Columns that still contain missing values (captured before any filling).
mis_col = [col for col in df.columns if df[col].isnull().any()]
# Missing sales figures presumably mean "no sales that day" — fill with 0
# rather than impute (TODO confirm with the data owner).
df = df.fillna({'transactions': 0, 'revenue': 0, 'avg_visit_revenue': 0, 'avg_bill': 0})
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='mean')  # impute remaining gaps with the column mean (the old comment wrongly said "most frequent")
# The zero-filled columns are still listed in mis_col, but mean-imputing a
# column that no longer has NaNs is a no-op, so the result is unchanged.
df[mis_col] = imp.fit_transform(df[mis_col])
#better looking plot
sns.set(style='whitegrid', font_scale=1.5)
g = sns.relplot(x="visits_all", y="revenue", hue="season", data=df, height=10)
sns.set(style='whitegrid', font_scale=1.5)
sns.relplot(x="visits_all", y="avg_visit_revenue", data=df, height=9, color='tomato');
plt.figure(figsize=(10, 6))
sns.set(style='whitegrid', font_scale=1.2)
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True, stat='density') is the documented replacement.
sns.histplot(df.avg_bill, kde=True, stat='density');  # skewed a bit to the right, but nice looking (log is terrible)
sns.set(style='whitegrid', font_scale=1.5)
sns.catplot(x="season", y='revenue', data=df, height=9);
import matplotlib.pyplot as plt
# Correlation matrix over numeric columns only: 'season' is a string column,
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0 (older
# versions silently excluded it, so the result is identical).
corrmat = df.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(corrmat, vmin=-.8, vmax=.8, square=True, cmap='RdBu_r')
sns.pairplot(df.loc[:, ['visits_all', "unique_visitors", "bounce", "session_duration", 'revenue', 'season']], hue="season", height=3);
df.info()
corrmat['revenue'].sort_values(ascending=False)  # features most correlated with revenue (old comment referenced lnSalePrice from another project)
corrval = corrmat.unstack()
corrval[(abs(corrval) > 0.6) & (abs(corrval) < 1)]  # check the correlated features
# One-hot encode the categorical 'season' column.
df_dummy = pd.get_dummies(df)
df_dummy.head()
# Drop transactions and avg_visit_revenue before modeling (presumably
# because they are too tightly coupled to revenue — confirm).
df_dummy = df_dummy.drop(['transactions', 'avg_visit_revenue'], axis=1)
# Target and feature matrix: predict revenue from everything else but avg_bill.
y = df_dummy.revenue
X = df_dummy.drop(['revenue', "avg_bill"], axis=1)
X.head()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# 75/25 split with a fixed seed for reproducibility.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=0)
lr = LinearRegression()  # initiate the model with default parameters
lr.fit(train_X, train_y)
lr.predict(test_X)
# LinearRegression.score returns R^2, not accuracy/AUC — the old variable
# name (auc_lr) and print label were misleading.
r2_lr = lr.score(test_X, test_y)
print("R^2 of linear regression on test set:", r2_lr)
y = df_dummy.revenue
X = df_dummy.drop(['revenue', "avg_bill"], axis=1)
# Drop the block of columns at positions 37-49 (the "interest" columns per
# the comment below) by label. The original passed a DataFrame slice
# (X.iloc[:, 37:49]) to drop(), which expects labels — X.columns[37:49] is
# the explicit, reliable form.
X = X.drop(X.columns[37:49], axis=1)
X = X.drop("tourism", axis=1)
X.info()
#linear regression without interest
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=0)
lr = LinearRegression()  # initiate the model with default parameters
lr.fit(train_X, train_y)
lr.predict(test_X)
# score() returns R^2, not accuracy/AUC — label it accordingly.
r2_lr = lr.score(test_X, test_y)
print("R^2 of linear regression on test set (no interest col):", r2_lr)
plt.figure(figsize=(20, 10))
pd.Series(lr.coef_, index=X.columns).sort_values().plot(kind="barh");
#horror
lr.score(X, y)  # R^2 on the full dataset (train + test combined, not just the train set)
import statsmodels.api as sm
from scipy import stats
# Fit OLS and print the full summary (coefficients, p-values, R^2).
# NOTE(review): sm.OLS adds no constant term by default — confirm that a
# model without an intercept is intended here.
ols_fit = sm.OLS(y, X).fit()
print(ols_fit.summary())
from sklearn.linear_model import Ridge
# Regularized least squares, fitted and scored on the full dataset.
ridge = Ridge(solver='lsqr')  # iterative least-squares solver
ridge.fit(X, y)
ridge.score(X, y)
from sklearn.linear_model import LassoCV
from sklearn.datasets import make_regression
# Lasso with cross-validated regularization strength; the L1 penalty drives
# weak feature coefficients exactly to zero.
reg = LassoCV()
reg.fit(X, y)
print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
print("Best score using built-in LassoCV: %f" % reg.score(X, y))
coef = pd.Series(reg.coef_, index=X.columns)
n_kept = sum(coef != 0)
n_zeroed = sum(coef == 0)
print("Lasso picked " + str(n_kept) + " variables and eliminated the other " + str(n_zeroed) + " variables")
# Bar chart of the surviving coefficients, sorted by magnitude/sign.
imp_coef = coef.sort_values()
import matplotlib
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind="barh");
plt.title("Feature importance using Lasso Model")
# Target vector: revenue.
labels = np.array(df_dummy['revenue'])
# Feature matrix: everything except the target and avg_bill (axis=1 drops columns).
features = df_dummy.drop(['revenue', 'avg_bill'], axis=1)
# Keep the column names around — needed later when plotting a tree.
feature_list = list(features.columns)
features = np.array(features)
from sklearn.model_selection import train_test_split
# 75/25 train/test split with a fixed seed for reproducibility.
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.25, random_state=42)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate the forest with 150 trees (the original comment wrongly said 1000).
rf = RandomForestRegressor(n_estimators=150, random_state=42)
# Train the model on training data
rf.fit(train_features, train_labels);
# score() is R^2 for a regressor, printed here under the original "accuracy" label.
ausrf = rf.score(test_features, test_labels)
print("Accuracy of random forest model:{:.4f}".format(ausrf))
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Mean absolute error in revenue units (the original printed 'degrees',
# left over from a temperature-forecasting tutorial).
errors = abs(predictions - test_labels)
mae_init = np.mean(errors)
print('Mean Absolute Error:', mae_init)
from sklearn import metrics
mse_init = metrics.mean_squared_error(test_labels, predictions)
print('Mean Squared Error:', mse_init)
rmse_init = np.sqrt(mse_init)  # reuse the MSE instead of recomputing it
print('Root Mean Squared Error:', rmse_init)
# Try different numbers of n_estimators - this will take a minute or so
estimators = np.arange(10, 200, 10)
scores = []
# matplotlib >= 3.6 renamed the seaborn styles to 'seaborn-v0_8-*';
# fall back so the sweep works on both old and new matplotlib.
try:
    plt.style.use('seaborn-whitegrid')
except OSError:
    plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'font.size': 25})
plt.figure(figsize=(15, 10))
for n in estimators:
    # NOTE: this mutates rf in place — after the loop, rf holds the last
    # (190-tree) fit, which the tree-export cell below relies on.
    rf.set_params(n_estimators=n)
    rf.fit(train_features, train_labels)
    scores.append(rf.score(test_features, test_labels))
#plt.title("Effect of n_estimators")
plt.xlabel("n_estimator", fontsize=17)
plt.ylabel("score", fontsize=17)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.plot(estimators, scores);
from sklearn.tree import export_graphviz
import os
# Make the Graphviz binaries reachable from this process.
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/' #Pay attention to modifying your path
# Pick one tree (index 5) out of the fitted forest and dump it as a .dot file.
estimator = rf.estimators_[5]
export_graphviz(
    estimator,
    out_file='tree.dot',
    feature_names=feature_list,
    rounded=True,
    proportion=False,
    precision=2,
    filled=True,
)
# Render the .dot file to a PNG with the dot binary (shell=False arg list).
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Display in jupyter notebook
from IPython.display import Image
Image(filename='tree.png')